
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import os
import pandas as pd
import numpy as np
from C_PatentVariables import collecting_patstat, family_aggregation, firm_year_aggregation


def collect_nace_codes_autoindustry(test):
    """Collect the application ids tagged with an auto-industry NACE code.

    Scans every file of ``PATHS.patstatglobal`` whose name contains
    ``'tls229'`` and keeps the rows whose ``nace2_code`` is one of:

    * 29.1 -- Manufacture of Motor Vehicles
    * 29.3 -- Manufacture of Parts and Accessories for Motor Vehicles

    Args:
        test: When truthy, only the first chunk (at most 500,000 rows) of
            each matching file is read; every matching file is still opened.

    Returns:
        list: Unique ``appln_id`` values, in order of first appearance.

    Raises:
        ValueError: If no tls229 data could be read (no matching file in
            ``PATHS.patstatglobal``, or the matching files yielded no chunk).
    """
    LOGGER.info('Collect appln_id with nace codes 29.1 or 29.3')
    target_codes = [29.1, 29.3]
    list_files = [f for f in os.listdir(PATHS.patstatglobal) if 'tls229' in f]
    matching_ids = []
    for file in list_files:
        LOGGER.info('       {}'.format(file))
        # Only the two columns used below are parsed, which keeps each chunk small.
        reader = pd.read_csv(PATHS.patstatglobal / file, chunksize=500000, sep=',',
                             low_memory=False, usecols=['appln_id', 'nace2_code'])
        for dfchunk in reader:
            # Coerce to numbers first: if a chunk is parsed as strings, a direct
            # isin() against float literals would silently match nothing.
            codes = pd.to_numeric(dfchunk['nace2_code'], errors='coerce')
            matching_ids.append(dfchunk.loc[codes.isin(target_codes), 'appln_id'])
            if test:
                # Leaves the chunk loop only; the next file is still processed.
                break
    if not matching_ids:
        # Same exception type pd.concat would raise on an empty list, but with
        # a message that points at the actual cause.
        raise ValueError('No tls229 data could be read from {}'.format(PATHS.patstatglobal))
    return pd.concat(matching_ids).drop_duplicates().tolist()


def main(test=False):
    """Run the auto-industry NACE patent pipeline and save a yearly family count.

    The steps run in a fixed order because the later ones read what the
    earlier ones produced:

    1. Collect the appln_ids carrying the auto-industry nace codes.
    2. Resolve the matching appln_id / docdb_id lists through
       ``collecting_patstat.collecting_ids``.
    3. Call the ``collecting_patstat`` / ``family_aggregation`` helpers for
       application info, family aggregation, ipc/cpc codes, nace codes and
       psn_sector (what each helper stores is defined in those modules).
    4. Read ``Families_of_nace_motor.csv``, count families per year by
       treating the whole set as a single firm, and write
       ``TimeSeries_nace_motor_FamInfo.csv``.

    Args:
        test: Forwarded unchanged to every collection helper.

    Returns:
        None
    """
    LOGGER.info('SCRIPT: e_patent_nace_autoindustry.py')
    LOGGER.info("       Collect info for patents that fall into auto industry nace")
    namefile = 'nace_motor'

    # appln_ids associated with the targeted nace codes
    seed_appln_ids = collect_nace_codes_autoindustry(test)
    # First pass through patstat to also gather the docdb ids matching those appln_ids
    list_appln_id, list_docdb_id = collecting_patstat.collecting_ids(seed_appln_ids, [], test)

    LOGGER.info("       Collect info at application level")
    collecting_patstat.collecting_info_on_applnids(list_appln_id, list_docdb_id, namefile, test, small=True)

    LOGGER.info("       Aggregating patents at family level")
    family_aggregation.aggregate_at_familylevel(namefile)

    LOGGER.info("       Collect all ipc and cpc codes of patents and aggregate them at family level")
    collecting_patstat.extract_and_save_cpc_ipc_codes(namefile, test)

    LOGGER.info("       Collect naces codes associated to patents and aggregate them at family level")
    collecting_patstat.extract_and_save_nace_codes(namefile, test)

    LOGGER.info("       Collect psn_sector of applicants associated to patents and aggregate them at family level")
    collecting_patstat.extract_and_save_psnsector(list_appln_id, namefile, test)

    LOGGER.info("       AGGREGATION YEAR LEVEL")
    families = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_{}.csv'.format(namefile))
    # Reuse the firm-year code path by pretending every family belongs to one firm
    families['firm_agg_id'] = 1
    key_columns = ['firm_agg_id', 'docdb_family_id', 'earliest_filing_year']
    family_years = families[key_columns].drop_duplicates()
    family_years = family_years.rename(columns={'earliest_filing_year': 'year_agg'})

    yearly = firm_year_aggregation.counting_families(family_years, namefile,  'Count')
    # Undo the single-firm trick: restore the year column name and drop the dummy firm id
    yearly = yearly.reset_index()
    yearly = yearly.rename(columns={'year_agg': 'earliest_filing_year'})
    yearly = yearly.drop(columns='firm_agg_id')
    yearly.to_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/TimeSeries_{}_FamInfo.csv'.format(namefile), index=False)

    LOGGER.info('TimeSeries_{}_FamInfo.csv Saved'.format(namefile))
    LOGGER.info('SCRIPT END: e_patent_nace_autoindustry.py')

